﻿Imports System
Imports System.IO
Imports System.Collections.Generic
Imports System.Linq
Imports System.Text

Namespace CharsetDetector.SimpleHelpers
    ''' <summary>
    ''' Detects the text encoding of files, streams and byte buffers by combining a
    ''' byte-order-mark check, a binary-content heuristic and the Ude charset detector.
    ''' </summary>
    ''' <remarks>
    ''' Use the shared helpers for one-shot detection, or create an instance, call
    ''' <c>Detect</c> once per buffer and finish with <c>Complete</c>.
    ''' In every overload taking <c>start</c> and <c>count</c>, <c>start</c> is the offset of the
    ''' first byte and <c>count</c> is the number of bytes to examine.
    ''' </remarks>
    Public Class FileEncoding
        ''' <summary>Buffer size, in bytes, passed to the FileStream opened by DetectFileEncoding.</summary>
        Const DEFAULT_BUFFER_SIZE As Integer = 128 * 1024

        ''' <summary>Detects the encoding of the file at <paramref name="inputFilename"/>.</summary>
        ''' <param name="inputFilename">Path of the file to inspect; opened read-only with shared read/write/delete access.</param>
        ''' <param name="defaultIfNotDetected">Value returned when no encoding could be determined.</param>
        ''' <returns>The detected encoding, or <paramref name="defaultIfNotDetected"/>.</returns>
        Public Shared Function DetectFileEncoding(ByVal inputFilename As String, Optional ByVal defaultIfNotDetected As Encoding = Nothing) As Encoding
            Using stream As New FileStream(inputFilename, FileMode.Open, FileAccess.Read, FileShare.ReadWrite Or FileShare.Delete, DEFAULT_BUFFER_SIZE)
                Return DetectFileEncoding(stream, defaultIfNotDetected)
            End Using
        End Function

        ''' <summary>Detects the encoding of the data read from <paramref name="inputStream"/>.</summary>
        ''' <param name="inputStream">Stream to read from; it is read from its current position and not closed.</param>
        ''' <param name="defaultIfNotDetected">Value returned when no encoding could be determined.</param>
        ''' <returns>The detected encoding, or <paramref name="defaultIfNotDetected"/>.</returns>
        Public Shared Function DetectFileEncoding(ByVal inputStream As Stream, Optional ByVal defaultIfNotDetected As Encoding = Nothing) As Encoding
            Dim detector As New FileEncoding()
            detector.Detect(inputStream)
            Return If(detector.Complete(), defaultIfNotDetected)
        End Function

        ''' <summary>Detects the encoding of <paramref name="count"/> bytes of <paramref name="inputData"/> beginning at <paramref name="start"/>.</summary>
        ''' <param name="inputData">Buffer holding the data.</param>
        ''' <param name="start">Offset of the first byte to examine.</param>
        ''' <param name="count">Number of bytes to examine.</param>
        ''' <param name="defaultIfNotDetected">Value returned when no encoding could be determined.</param>
        ''' <returns>The detected encoding, or <paramref name="defaultIfNotDetected"/>.</returns>
        Public Shared Function DetectFileEncoding(ByVal inputData As Byte(), ByVal start As Integer, ByVal count As Integer, Optional ByVal defaultIfNotDetected As Encoding = Nothing) As Encoding
            Dim detector As New FileEncoding()
            detector.Detect(inputData, start, count)
            Return If(detector.Complete(), defaultIfNotDetected)
        End Function

        ''' <summary>
        ''' Reads the whole file as text using its detected encoding. Never throws: any failure
        ''' (missing file, I/O error, unsupported encoding, ...) yields <paramref name="defaultValue"/>.
        ''' </summary>
        ''' <param name="filename">Path of the file to load.</param>
        ''' <param name="defaultValue">Value returned when the file does not exist or cannot be read.</param>
        ''' <returns>The file content, or <paramref name="defaultValue"/>.</returns>
        Public Shared Function TryLoadFile(ByVal filename As String, Optional ByVal defaultValue As String = "") As String
            Try
                If System.IO.File.Exists(filename) Then
                    Dim detected As Encoding = DetectFileEncoding(filename)

                    ' ReadAllText rejects a null encoding, so when nothing was detected
                    ' (e.g. an empty file) use the overload that applies its own default.
                    If detected Is Nothing Then Return System.IO.File.ReadAllText(filename)
                    Return System.IO.File.ReadAllText(filename, detected)
                End If
            Catch
                ' Best effort by design: every failure falls through to the default value.
            End Try

            Return defaultValue
        End Function

        ''' <summary>Checks whether the whole buffer looks like text rather than binary data.</summary>
        ''' <param name="rawData">Buffer to inspect.</param>
        ''' <returns>See the three-argument overload.</returns>
        ''' <exception cref="ArgumentNullException"><paramref name="rawData"/> is Nothing.</exception>
        Public Shared Function CheckForTextualData(ByVal rawData As Byte()) As Boolean
            If rawData Is Nothing Then Throw New ArgumentNullException("rawData")
            Return CheckForTextualData(rawData, 0, rawData.Length)
        End Function

        ''' <summary>
        ''' Heuristically checks whether <paramref name="count"/> bytes beginning at
        ''' <paramref name="start"/> look like text rather than binary data.
        ''' </summary>
        ''' <param name="rawData">Buffer to inspect.</param>
        ''' <param name="start">Offset of the first byte to examine.</param>
        ''' <param name="count">Number of bytes to examine.</param>
        ''' <returns>
        ''' True when the range is shorter than 4 bytes or does not fit inside the buffer (too little
        ''' information to call it binary), when it starts with a byte order mark, or when it contains
        ''' no pair of consecutive zero bytes and at most 10% "zero byte followed by a byte below 10"
        ''' sequences; otherwise False.
        ''' </returns>
        ''' <exception cref="ArgumentNullException"><paramref name="rawData"/> is Nothing.</exception>
        Public Shared Function CheckForTextualData(ByVal rawData As Byte(), ByVal start As Integer, ByVal count As Integer) As Boolean
            If rawData Is Nothing Then Throw New ArgumentNullException("rawData")

            ' Written as a subtraction so that start + count can never overflow.
            If start < 0 OrElse count < 4 OrElse count > rawData.Length - start Then Return True

            If CheckForByteOrderMark(rawData, start) Then Return True

            Dim lastIndex As Integer = start + count - 1
            Dim controlSequences As Integer = 0

            For i As Integer = start + 1 To lastIndex
                If rawData(i - 1) <> 0 Then Continue For

                ' Two consecutive zero bytes are taken as proof of binary content.
                If rawData(i) = 0 Then Return False

                If rawData(i) < 10 Then controlSequences += 1
            Next

            ' Integer division: tolerate control sequences in up to 10% of the examined range.
            Return controlSequences <= count \ 10
        End Function

        ''' <summary>
        ''' Checks whether the data at <paramref name="start"/> begins with a known byte order mark:
        ''' UTF-16 BE (FE FF), UTF-16 LE (FF FE, which is also the prefix of the UTF-32 LE mark),
        ''' UTF-8 (EF BB BF), UTF-7 (2B 2F 76) or UTF-32 BE (00 00 FE FF).
        ''' </summary>
        ''' <param name="rawData">Buffer to inspect.</param>
        ''' <param name="start">Offset at which the mark is expected.</param>
        ''' <returns>True when a byte order mark is present; False otherwise, including when the buffer is Nothing or too short.</returns>
        Private Shared Function CheckForByteOrderMark(ByVal rawData As Byte(), Optional ByVal start As Integer = 0) As Boolean
            If rawData Is Nothing OrElse start < 0 Then Return False

            ' Each mark is only tested when enough bytes are available for it.
            Dim available As Integer = rawData.Length - start
            If available < 2 Then Return False

            Dim b0 As Byte = rawData(start)
            Dim b1 As Byte = rawData(start + 1)
            If b0 = &HFE AndAlso b1 = &HFF Then Return True
            If b0 = &HFF AndAlso b1 = &HFE Then Return True

            If available < 3 Then Return False

            Dim b2 As Byte = rawData(start + 2)
            If b0 = &HEF AndAlso b1 = &HBB AndAlso b2 = &HBF Then Return True
            If b0 = &H2B AndAlso b1 = &H2F AndAlso b2 = &H76 Then Return True

            If available < 4 Then Return False

            Return b0 = 0 AndAlso b1 = 0 AndAlso b2 = &HFE AndAlso rawData(start + 3) = &HFF
        End Function

        ' Detector fed with every buffer passed to Detect, across calls.
        Private _detector As Ude.CharsetDetector = New Ude.CharsetDetector()

        ' True once the first buffer of the current detection run has been inspected.
        Private _started As Boolean = False

        ''' <summary>
        ''' True when detection has finished: after Complete, after a conclusive detector result,
        ''' or after the first buffer was classified as binary. Detect ignores further data while set.
        ''' </summary>
        Public Property Done As Boolean

        ''' <summary>Name of the charset currently considered the best match, or Nothing when there is none.</summary>
        Public Property EncodingName As String

        ''' <summary>Result of CheckForTextualData on the first buffer of the current detection run.</summary>
        Public Property IsText As Boolean

        ''' <summary>True when the first buffer of the current detection run starts with a byte order mark.</summary>
        Public Property HasByteOrderMark As Boolean

        ' Number of votes collected for each charset name.
        Private _encodingFrequency As Dictionary(Of String, Integer) = New Dictionary(Of String, Integer)(StringComparer.Ordinal)

        ''' <summary>Clears all detection state so the instance can be used for another input.</summary>
        Public Sub Reset()
            _started = False
            Done = False
            IsText = False
            HasByteOrderMark = False
            _encodingFrequency.Clear()
            _detector.Reset()
            EncodingName = Nothing
        End Sub

        ''' <summary>Detects the encoding of <paramref name="inputData"/>, reading at most 20 MiB.</summary>
        ''' <param name="inputData">Stream to read from.</param>
        ''' <returns>The detected charset name, or Nothing.</returns>
        Public Function Detect(ByVal inputData As Stream) As String
            Return Detect(inputData, 20 * 1024 * 1024)
        End Function

        ''' <summary>
        ''' Reads <paramref name="inputData"/> in chunks, feeds each chunk to the detector and
        ''' finishes the detection with Complete.
        ''' </summary>
        ''' <param name="inputData">Stream to read from; it is not closed.</param>
        ''' <param name="maxSize">Upper bound on the number of bytes read; 0 or less means no limit.</param>
        ''' <param name="bufferSize">Size of the read buffer, in bytes.</param>
        ''' <returns>The detected charset name, or Nothing.</returns>
        ''' <exception cref="ArgumentNullException"><paramref name="inputData"/> is Nothing.</exception>
        ''' <exception cref="ArgumentOutOfRangeException"><paramref name="bufferSize"/> is 0 or less.</exception>
        Public Function Detect(ByVal inputData As Stream, ByVal maxSize As Integer, Optional ByVal bufferSize As Integer = 16 * 1024) As String
            If inputData Is Nothing Then Throw New ArgumentNullException("inputData")
            If bufferSize <= 0 Then Throw New ArgumentOutOfRangeException("bufferSize", "Buffer size cannot be 0 or less.")

            Dim chunkSize As Integer = bufferSize
            Dim maxIterations As Integer = Int32.MaxValue

            If maxSize > 0 Then
                ' Shrinking the chunk keeps at least one read when maxSize < bufferSize.
                chunkSize = Math.Min(bufferSize, maxSize)
                ' "\" is integer division; "/" would produce a Double and round.
                maxIterations = maxSize \ chunkSize
            End If

            Dim buffer As Byte() = New Byte(chunkSize - 1) {}
            Dim iteration As Integer = 0

            While iteration < maxIterations
                iteration += 1

                Dim bytesRead As Integer = inputData.Read(buffer, 0, buffer.Length)
                If bytesRead <= 0 Then Exit While

                Detect(buffer, 0, bytesRead)
                If Done Then Exit While
            End While

            Complete()
            Return EncodingName
        End Function

        ''' <summary>
        ''' Feeds one buffer of data to the detection. May be called repeatedly with consecutive
        ''' buffers; call Complete afterwards to obtain the final result.
        ''' </summary>
        ''' <param name="inputData">Buffer holding the data.</param>
        ''' <param name="start">Offset of the first byte to examine.</param>
        ''' <param name="count">Number of bytes to examine.</param>
        ''' <returns>The name of the best charset candidate so far, or Nothing.</returns>
        ''' <exception cref="ArgumentNullException"><paramref name="inputData"/> is Nothing.</exception>
        ''' <exception cref="ArgumentOutOfRangeException">The range does not fit inside <paramref name="inputData"/>.</exception>
        Public Function Detect(ByVal inputData As Byte(), ByVal start As Integer, ByVal count As Integer) As String
            If Done Then Return EncodingName

            If inputData Is Nothing Then Throw New ArgumentNullException("inputData")
            If start < 0 OrElse start > inputData.Length Then Throw New ArgumentOutOfRangeException("start", "Start must lie inside the buffer.")
            If count < 0 OrElse count > inputData.Length - start Then Throw New ArgumentOutOfRangeException("count", "Count must not exceed the bytes available after start.")

            If Not _started Then
                Reset()
                _started = True

                ' Only the first buffer decides whether the input is treated as binary.
                If Not CheckForTextualData(inputData, start, count) Then
                    IsText = False
                    Done = True
                    Return EncodingName
                End If

                HasByteOrderMark = CheckForByteOrderMark(inputData, start)
                IsText = True
            End If

            ' Cumulative detection over everything fed so far.
            _detector.Feed(inputData, start, count)
            _detector.DataEnd()

            If _detector.IsDone() AndAlso Not String.IsNullOrEmpty(_detector.Charset) Then
                IncrementFrequency(_detector.Charset)
                ' Refresh the name before returning so the caller does not get a stale value.
                EncodingName = GetCurrentEncoding()
                Done = True
                Return EncodingName
            End If

            ' No conclusive answer yet: analyse the buffer in independent slices and
            ' let every sufficiently confident slice vote for its charset.
            Const udeFeedSize As Integer = 4 * 1024
            Dim sliceDetector As New Ude.CharsetDetector()
            Dim endIndex As Integer = start + count
            Dim pos As Integer = start

            While pos < endIndex
                Dim sliceLength As Integer = Math.Min(udeFeedSize, endIndex - pos)

                sliceDetector.Reset()
                sliceDetector.Feed(inputData, pos, sliceLength)
                sliceDetector.DataEnd()

                If sliceDetector.Confidence > 0.3 AndAlso Not String.IsNullOrEmpty(sliceDetector.Charset) Then
                    IncrementFrequency(sliceDetector.Charset)
                End If

                pos += sliceLength
            End While

            EncodingName = GetCurrentEncoding()
            Return EncodingName
        End Function

        ''' <summary>Finishes the detection and resolves the winning charset name to an Encoding.</summary>
        ''' <returns>The detected encoding, or Nothing when no charset was determined.</returns>
        ''' <remarks>
        ''' The name is resolved with Encoding.GetEncoding, so any exception that method raises for a
        ''' name it cannot resolve propagates to the caller.
        ''' </remarks>
        Public Function Complete() As Encoding
            Done = True
            _detector.DataEnd()

            ' The vote decides. A conclusive detector result is normally already part of the
            ' tally (Detect records it); it is used directly only if no vote exists.
            Dim winner As String = GetCurrentEncoding()

            If String.IsNullOrEmpty(winner) AndAlso _detector.IsDone() AndAlso Not String.IsNullOrEmpty(_detector.Charset) Then
                winner = _detector.Charset
            End If

            EncodingName = winner

            If String.IsNullOrEmpty(EncodingName) Then Return Nothing
            Return Encoding.GetEncoding(EncodingName)
        End Function

        ''' <summary>Adds one vote for <paramref name="charset"/>.</summary>
        Private Sub IncrementFrequency(ByVal charset As String)
            Dim currentCount As Integer = 0
            ' TryGetValue leaves the count at 0 when the charset has no votes yet.
            _encodingFrequency.TryGetValue(charset, currentCount)
            _encodingFrequency(charset) = currentCount + 1
        End Sub

        ''' <summary>
        ''' Returns the charset with the most votes, or Nothing when there are none. "ASCII" is ranked
        ''' as if it had zero votes, so any other candidate with at least one vote beats it.
        ''' </summary>
        Private Function GetCurrentEncoding() As String
            If _encodingFrequency.Count = 0 Then Return Nothing

            Dim ranked = _encodingFrequency.OrderByDescending(Function(entry) If(String.Equals(entry.Key, "ASCII", StringComparison.Ordinal), 0, entry.Value))
            Return ranked.First().Key
        End Function
    End Class
End Namespace